Assignment1

Author

Xinran Wang

Settings

suppressPackageStartupMessages(library(knitr))

suppressPackageStartupMessages(library(data.table))
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(datasauRus))

suppressPackageStartupMessages(library(skimr))

suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(ggthemes))
suppressPackageStartupMessages(library(ggpubr))

suppressPackageStartupMessages(library(leaflet))
suppressPackageStartupMessages(library(leaflet.extras2))
idir <- "/Users/xinranwang/Documents/Course/25Fall/PM566/Data"

EDA

  1. (30 points) Given the formulated question from the assignment description, you will now conduct EDA Checklist items 1-5. First, download 2002 and 2022 data for all sites in California from the EPA Air Quality Data website, then read the data into R. For each of the two datasets, check the dimensions, headers, footers, variable names and variable types. Check the distribution of the key variable we are analyzing (PM). Write up a summary of all of your findings.

    Note. Since skim collapse all number into numeric, I also attached a table to distinguish between numerical types

    Summary. This data frame includes measurements at the state level (California), county level (51 counties), and city level (199 monitoring sites). PM2.5 was recorded using 21 different methods (see Method.Description for details) and reported in units of µg/m3 LC.

    read_df <- function(idir, filename, year){
      df <- read.csv(paste0(idir, "/", filename)) %>% mutate(Year = year)
      return(df)
    }
    check_df <- function(df){
      print(paste0("Dimension: ", paste(dim(df), collapse = " x "),
                   "; # Rows: ", nrow(df), 
                   "; # Columns: ", ncol(df)))
    
      df_type <- data.frame(Type = sapply(df, function(x) paste(class(x), collapse = ", ")))
      kable(df_type, caption = "Variable types in df")
    }

    2002

    df_2002 <- read_df(idir, "ad_viz_plotval_data_PM2.5_2002.csv", "2002")
    check_df(df_2002)
    [1] "Dimension: 15976 x 23; # Rows: 15976; # Columns: 23"
    Variable types in df
    Type
    Date character
    Source character
    Site.ID integer
    POC integer
    Daily.Mean.PM2.5.Concentration numeric
    Units character
    Daily.AQI.Value integer
    Local.Site.Name character
    Daily.Obs.Count integer
    Percent.Complete numeric
    AQS.Parameter.Code integer
    AQS.Parameter.Description character
    Method.Code integer
    Method.Description character
    CBSA.Code integer
    CBSA.Name character
    State.FIPS.Code integer
    State character
    County.FIPS.Code integer
    County character
    Site.Latitude numeric
    Site.Longitude numeric
    Year character
    skim(df_2002)
    Data summary
    Name df_2002
    Number of rows 15976
    Number of columns 23
    _______________________
    Column type frequency:
    character 10
    numeric 13
    ________________________
    Group variables None

    Variable type: character

    skim_variable n_missing complete_rate min max empty n_unique whitespace
    Date 0 1 10 10 0 365 0
    Source 0 1 3 3 0 1 0
    Units 0 1 8 8 0 1 0
    Local.Site.Name 0 1 0 43 118 103 0
    AQS.Parameter.Description 0 1 24 38 0 2 0
    Method.Description 0 1 28 62 0 6 0
    CBSA.Name 0 1 0 45 929 31 0
    State 0 1 10 10 0 1 0
    County 0 1 4 15 0 48 0
    Year 0 1 4 4 0 1 0

    Variable type: numeric

    skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
    Site.ID 0 1.00 60549600.12 289945.67 60010007.00 60290014.00 60590007.00 60731002.00 61131003.00 ▃▇▅▆▃
    POC 0 1.00 1.58 1.25 1.00 1.00 1.00 1.00 6.00 ▇▁▁▁▁
    Daily.Mean.PM2.5.Concentration 0 1.00 16.12 13.87 0.00 7.00 12.00 20.50 104.30 ▇▂▁▁▁
    Daily.AQI.Value 0 1.00 59.28 31.95 0.00 39.00 56.00 72.00 185.00 ▃▇▂▁▁
    Daily.Obs.Count 0 1.00 1.00 0.00 1.00 1.00 1.00 1.00 1.00 ▁▁▇▁▁
    Percent.Complete 0 1.00 100.00 0.00 100.00 100.00 100.00 100.00 100.00 ▁▁▇▁▁
    AQS.Parameter.Code 0 1.00 88215.26 181.01 88101.00 88101.00 88101.00 88502.00 88502.00 ▇▁▁▁▃
    Method.Code 0 1.00 297.04 282.45 117.00 120.00 120.00 707.00 810.00 ▇▁▁▁▃
    CBSA.Code 929 0.94 33269.99 11059.59 12540.00 23420.00 40140.00 41740.00 49700.00 ▃▂▃▇▁
    State.FIPS.Code 0 1.00 6.00 0.00 6.00 6.00 6.00 6.00 6.00 ▁▁▇▁▁
    County.FIPS.Code 0 1.00 54.78 28.98 1.00 29.00 59.00 73.00 113.00 ▃▇▅▆▃
    Site.Latitude 0 1.00 36.00 2.28 32.63 34.07 35.36 37.77 41.71 ▇▃▅▃▁
    Site.Longitude 0 1.00 -119.38 1.95 -124.16 -121.37 -119.06 -117.87 -115.48 ▁▇▇▇▅
    kable(head(df_2002, 1), caption = "Head of df")
    Head of df
    Date Source Site.ID POC Daily.Mean.PM2.5.Concentration Units Daily.AQI.Value Local.Site.Name Daily.Obs.Count Percent.Complete AQS.Parameter.Code AQS.Parameter.Description Method.Code Method.Description CBSA.Code CBSA.Name State.FIPS.Code State County.FIPS.Code County Site.Latitude Site.Longitude Year
    01/05/2002 AQS 60010007 1 25.1 ug/m3 LC 81 Livermore 1 100 88101 PM2.5 - Local Conditions 120 Andersen RAAS2.5-300 PM2.5 SEQ w/WINS 41860 San Francisco-Oakland-Hayward, CA 6 California 1 Alameda 37.68753 -121.7842 2002
    kable(tail(df_2002, 1), caption = "Tail of df")
    Tail of df
    Date Source Site.ID POC Daily.Mean.PM2.5.Concentration Units Daily.AQI.Value Local.Site.Name Daily.Obs.Count Percent.Complete AQS.Parameter.Code AQS.Parameter.Description Method.Code Method.Description CBSA.Code CBSA.Name State.FIPS.Code State County.FIPS.Code County Site.Latitude Site.Longitude Year
    15976 12/31/2002 AQS 61131003 1 6 ug/m3 LC 33 Woodland-Gibson Road 1 100 88101 PM2.5 - Local Conditions 117 R & P Model 2000 PM2.5 Sampler w/WINS 40900 Sacramento–Roseville–Arden-Arcade, CA 6 California 113 Yolo 38.66121 -121.7327 2002

    2022

    df_2022 <- read_df(idir, "ad_viz_plotval_data_PM2.5_2022.csv", "2022")
    check_df(df_2022)
    [1] "Dimension: 59918 x 23; # Rows: 59918; # Columns: 23"
    Variable types in df
    Type
    Date character
    Source character
    Site.ID integer
    POC integer
    Daily.Mean.PM2.5.Concentration numeric
    Units character
    Daily.AQI.Value integer
    Local.Site.Name character
    Daily.Obs.Count integer
    Percent.Complete numeric
    AQS.Parameter.Code integer
    AQS.Parameter.Description character
    Method.Code integer
    Method.Description character
    CBSA.Code integer
    CBSA.Name character
    State.FIPS.Code integer
    State character
    County.FIPS.Code integer
    County character
    Site.Latitude numeric
    Site.Longitude numeric
    Year character
    skim(df_2022)
    Data summary
    Name df_2022
    Number of rows 59918
    Number of columns 23
    _______________________
    Column type frequency:
    character 10
    numeric 13
    ________________________
    Group variables None

    Variable type: character

    skim_variable n_missing complete_rate min max empty n_unique whitespace
    Date 0 1 10 10 0 365 0
    Source 0 1 3 3 0 1 0
    Units 0 1 8 8 0 1 0
    Local.Site.Name 0 1 3 42 0 165 0
    AQS.Parameter.Description 0 1 24 38 0 2 0
    Method.Description 0 1 24 62 0 18 0
    CBSA.Name 0 1 0 45 4569 33 0
    State 0 1 10 10 0 1 0
    County 0 1 4 15 0 50 0
    Year 0 1 4 4 0 1 0

    Variable type: numeric

    skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
    Site.ID 0 1.00 60564224.91 290630.89 60010007.00 60290019.00 60631006.00 60731026.00 61131003.00 ▅▇▇▇▃
    POC 0 1.00 3.77 4.87 1.00 1.00 3.00 3.00 24.00 ▇▁▁▁▁
    Daily.Mean.PM2.5.Concentration 0 1.00 8.41 7.64 -6.70 4.10 6.80 10.70 302.50 ▇▁▁▁▁
    Daily.AQI.Value 0 1.00 39.22 21.55 0.00 23.00 38.00 54.00 454.00 ▇▁▁▁▁
    Daily.Obs.Count 0 1.00 1.00 0.00 1.00 1.00 1.00 1.00 1.00 ▁▁▇▁▁
    Percent.Complete 0 1.00 100.00 0.00 100.00 100.00 100.00 100.00 100.00 ▁▁▇▁▁
    AQS.Parameter.Code 0 1.00 88192.43 168.24 88101.00 88101.00 88101.00 88101.00 88502.00 ▇▁▁▁▂
    Method.Code 0 1.00 337.99 253.07 143.00 170.00 170.00 707.00 810.00 ▇▁▁▁▃
    CBSA.Code 4569 0.92 34970.64 10229.06 12540.00 31080.00 40140.00 41860.00 49700.00 ▂▂▂▇▂
    State.FIPS.Code 0 1.00 6.00 0.00 6.00 6.00 6.00 6.00 6.00 ▁▁▇▁▁
    County.FIPS.Code 0 1.00 56.28 29.06 1.00 29.00 63.00 73.00 113.00 ▅▇▇▇▃
    Site.Latitude 0 1.00 36.25 2.31 32.58 34.07 36.49 37.96 41.76 ▇▃▇▅▁
    Site.Longitude 0 1.00 -119.57 2.04 -124.20 -121.37 -119.59 -117.94 -115.48 ▁▇▆▇▃
    kable(head(df_2022, 1), caption = "Head of df")
    Head of df
    Date Source Site.ID POC Daily.Mean.PM2.5.Concentration Units Daily.AQI.Value Local.Site.Name Daily.Obs.Count Percent.Complete AQS.Parameter.Code AQS.Parameter.Description Method.Code Method.Description CBSA.Code CBSA.Name State.FIPS.Code State County.FIPS.Code County Site.Latitude Site.Longitude Year
    01/01/2022 AQS 60010007 3 12.7 ug/m3 LC 58 Livermore 1 100 88101 PM2.5 - Local Conditions 170 Met One BAM-1020 Mass Monitor w/VSCC 41860 San Francisco-Oakland-Hayward, CA 6 California 1 Alameda 37.68753 -121.7842 2022
    kable(tail(df_2022, 1), caption = "Tail of df")
    Tail of df
    Date Source Site.ID POC Daily.Mean.PM2.5.Concentration Units Daily.AQI.Value Local.Site.Name Daily.Obs.Count Percent.Complete AQS.Parameter.Code AQS.Parameter.Description Method.Code Method.Description CBSA.Code CBSA.Name State.FIPS.Code State County.FIPS.Code County Site.Latitude Site.Longitude Year
    59918 12/31/2022 AQS 61131003 1 1 ug/m3 LC 6 Woodland-Gibson Road 1 100 88101 PM2.5 - Local Conditions 145 R & P Model 2025 PM-2.5 Sequential Air Sampler w/VSCC 40900 Sacramento–Roseville–Arden-Arcade, CA 6 California 113 Yolo 38.66121 -121.7327 2022
    identical(colnames(df_2002), colnames(df_2022))
    [1] TRUE

    Note. Here, I combined data frames as they have identical column names.

    df <- rbind(df_2002, df_2022)
    rm(df_2002, df_2022)

    Distribution of PM 2.5

    Summary. There is a striking decrease in daily PM2.5 concentrations from 2002 to 2022. Mean and median PM 2.5 were cut in half. However, there are more outliers in 2022 and max PM 2.5 increased by 3 fold

    df <- df %>% rename(PM2.5 = Daily.Mean.PM2.5.Concentration) 
    df$Year <- factor(df$Year, levels = c("2002", "2022"))
    
    summary1 <- df %>% 
      group_by(Year) %>%
      summarise(
        mean          = round(mean(PM2.5, na.rm = T), 2), 
        sd            = round(sd(PM2.5, na.rm = TRUE), 2),
        min           = min(PM2.5, na.rm = TRUE),
        q25           = quantile(PM2.5, 0.25, na.rm = TRUE),
        median        = median(PM2.5, na.rm = TRUE),
        q75           = quantile(PM2.5, 0.75, na.rm = TRUE),
        max           = max(PM2.5, na.rm = TRUE)
      ) 
    kable(as.data.frame(summary1), align = "c")
    Year mean sd min q25 median q75 max
    2002 16.12 13.87 0.0 7.0 12.0 20.5 104.3
    2022 8.41 7.64 -6.7 4.1 6.8 10.7 302.5
  2. (10 points) Combine the two years of data into one data frame. Use the Date variable to create a new column for year, which will serve as an identifier. Change the names of the key variables so that they are easier to refer to in your code.

    Note. (1) I’ve created new column for year when reading data, (2) I’ve renamed Daily.Mean.PM2.5.Concentration to PM2.5 in previous step (3) I’ve combined data in previous step (before plotting).

  3. (20 points) Create a basic map (or maps) in leaflet showing the locations of the monitoring sites, using different colors for each year. Summarize the spatial distribution of the sites. How does this distribution change from 2002 to 2022?

    Map (2002 vs 2022)

    Summary. By 2022, more sites had been established, frequently clustering near the 2002 sites (see map above). The number of sites increased 3.75-fold from 2002 to 2022 (rising from 15,978 to 59,918 sites; see histrogram and table in Question 4).

    # Map
    #scales::hue_pal()(2)
    pal <- leaflet::colorFactor(c("#F8766D", "#00BFC4"), domain = levels(df$Year))
    
    leaflet(df) %>%
      addMapPane("left",  zIndex = 410) %>% 
      addMapPane("right", zIndex = 410) %>% 
    
      addProviderTiles('OpenStreetMap',
                   options = pathOptions(pane = "left"),
                   layerId = "layer_2002") %>%
      addProviderTiles('OpenStreetMap',
                   options = pathOptions(pane = "right"),
                   layerId = "layer_2022") %>%
    
      addCircles(data = df %>% filter(Year == 2002),
                 lat = ~Site.Latitude,
                 lng = ~Site.Longitude,
                 fillColor = "#F8766D",
                 color = "black",
                 stroke = TRUE,
                 weight = 0.2,
                 opacity = 0.5,
                 fillOpacity = 1, 
                 radius = 100,
                 options = pathOptions(pane = "left"),
                 group = "2002") %>%
      addCircles(data = df %>% filter(Year == 2022),
                 lat = ~Site.Latitude,
                 lng = ~Site.Longitude,
                 fillColor = "#00BFC4",
                 color = "black",
                 stroke = TRUE,
                 weight = 0.2,
                 opacity = 0.5,
                 fillOpacity = 1, 
                 radius = 100, 
                 options = pathOptions(pane = "right"),
                 group = "2022") %>%
    
      addLegend("topright", 
                title = "Location of Monitor Sites",
                pal = pal, values = ~Year, labels = ~Year
                ) %>%
    
      addLayersControl(overlayGroups = ~c("2002", "2022"),
                   options = layersControlOptions(collapsed = F)) %>%
    
      addSidebyside(
          layerId = "sidecontrols",
          leftId = "layer_2002",
          rightId = "layer_2022")
    #https://rdrr.io/cran/leaflet.extras2/man/addSidebyside.html
  4. (10 points) Check for any data issues such as missing or implausible values of PM in the combined dataset. Calculate the proportion of missing/implausible values for each year and report any temporal patterns you see in these observations.

    Missing values. There are no missing values in either year.

    Implausible values. In 2022, 215 out of 59,918 observations (0.36%) were implausible (defined as PM2.5 < 0).

    Temporal Patterns. The number of implausible observations increased in 2022, although the count remained very small overall).

    summary2 <- df %>% 
      group_by(Year) %>%
      summarise(
        `# Total`          = n(),
        `# Missing`        = sum(is.na(PM2.5)),
        `# Implausible`    = sum(PM2.5 < 0, na.rm = TRUE)
      ) %>%
      mutate(    
        `% Missing`     = paste0(round((`# Missing` / `# Total`) * 100, 2), "%"),
        `% Implausible` = paste0(round((`# Implausible` / `# Total`) * 100, 2), "%")) %>%
      select(`# Total`, `# Missing`, `% Missing`, `# Implausible`, `% Implausible`)
    
    kable(as.data.frame(summary2), align = "c")
    # Total # Missing % Missing # Implausible % Implausible
    15976 0 0% 0 0%
    59918 0 0% 215 0.36%
    # Count of PM 2.5 Concentration
    df$Year <- factor(df$Year, levels = c("2022", "2002"))  # 2002 will be drawn last (on top)
    ggplot(df, aes(x = PM2.5, fill = Year, color = Year)) +
      geom_histogram(bins = 100, alpha = 0.5, position = "identity") + 
      labs(title = "Histrogram of Daily PM 2.5 Concentration", 
           x = "PM2.5 Concentration",
           y = "Count") +
      scale_color_manual(name = "Year",
                         values = c("2002" = "#F8766D", "2022" = "#00BFC4")) +
      scale_fill_manual(name = "Year",
                         values = c("2002" = "#F8766D", "2022" = "#00BFC4")) +
      theme_classic()

    df$Year <- factor(df$Year, levels = c("2002", "2022"))  # 2002 will be drawn last (on top)
  5. (30 points) Explore the main question of interest at three different levels of spatial resolution. Create exploratory plots (e.g. boxplots, histograms, line plots, violin plots) and summary statistics that best suit each level of data. Be sure to write up explanations of what you observe in these data.

  • Level 1: State. Examine the primary question for the entire state.

  • Level 2: County. Examine the primary question for every county in California.

  • Level 3: City. Restrict the data to sites in Los Angeles county and examine the primary question for every site.

    Level 1: State

    Summary. The mean of daily PM 2.5 in 2022 (mean: 8.41; sd: 7.64) is lower than than 2002 (mean: 16.12; sd: 13.87), despite more outliers.

    summary3 <- df %>% 
      group_by(Year) %>%
      summarise(
        n             = n(),
        mean          = round(mean(PM2.5, na.rm = T), 2), 
        sd            = round(sd(PM2.5, na.rm = T), 2)) %>%
      mutate(`Mean (SD)`= paste0(mean, " (", sd, ")")) %>%
      select(Year, `Mean (SD)`) %>%
      arrange(Year)
    kable(as.data.frame(summary3), align = "c")
    Year Mean (SD)
    2002 16.12 (13.87)
    2022 8.41 (7.64)
    ggplot(df, aes_string(x = "Year", y = "PM2.5", fill = "Year", color = "Year")) +
      geom_boxplot(color = "black", size = 0.2) +
      labs(title = paste0(" Daily PM2.5 Concentration (2002 vs 2022) in California"),
           x = "Year",
           y = paste0("Daily PM2.5 Concentration")) +
      scale_color_manual(name = "Year",
                         values = c("2002" = "#F8766D", "2022" = "#00BFC4")) +
      theme_minimal() +
      theme(legend.position = "bottom",
            legend.box = "horizontal",
            legend.justification = "center")
    Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
    ℹ Please use tidy evaluation idioms with `aes()`.
    ℹ See also `vignette("ggplot2-in-packages")` for more information.
    Warning: No shared levels found between `names(values)` of the manual scale and the
    data's colour values.

    Level 2: County

    Summary. The mean of daily PM 2.5 for most counties in California in 2022 are lower than that of 2000, except for Trinity, Siskiyou, Mono, and Del Norte.

    summary4 <- df %>% 
      group_by(County, Year) %>%
      summarise(
        n             = n(),
        mean          = round(mean(PM2.5, na.rm = T), 2), 
        sd            = round(sd(PM2.5, na.rm = TRUE), 2)) %>%
      mutate(`Mean (SD)`= paste0(mean, " (", sd, ")")) %>%
      select(Year, County, `Mean (SD)`) %>%
      pivot_wider(
        names_from = Year, 
        values_from = `Mean (SD)`, 
        names_prefix = "Mean (SD) in ") %>%
      select(County, `Mean (SD) in 2002`, `Mean (SD) in 2022`) 
    `summarise()` has grouped output by 'County'. You can override using the
    `.groups` argument.
    kable(as.data.frame(summary4), align = "c")
    County Mean (SD) in 2002 Mean (SD) in 2022
    Alameda 14.25 (11.43) 8.2 (4.95)
    Butte 14.76 (11.66) 6.19 (5.79)
    Calaveras 9.9 (6.5) 6.04 (4.1)
    Colusa 11.73 (10.01) 7.61 (4.76)
    Contra Costa 15.09 (14.47) 8.24 (4.93)
    Del Norte 3.82 (2.56) 4.97 (3.58)
    El Dorado 4.91 (4.07) 4.07 (9.18)
    Fresno 19.93 (18.91) 10.19 (8.3)
    Glenn NA 5.34 (4.98)
    Humboldt 7.79 (4.59) 6.76 (4.35)
    Imperial 12.71 (7.7) 9.55 (6.39)
    Inyo 7.14 (10.49) 5.55 (5.47)
    Kern 23 (18.72) 9.88 (9.91)
    Kings 24.7 (19.06) 14.4 (10.78)
    Lake 6.5 (10.88) 4.26 (3.12)
    Los Angeles 19.66 (11.88) 10.97 (5.24)
    Madera NA 10.39 (7.26)
    Marin 7.31 (4.87) 6.54 (4.17)
    Mariposa 9.48 (5.97) 9.47 (17.78)
    Mendocino 8.85 (8.79) 10.14 (5.19)
    Merced 21.54 (15.32) 10.06 (7.32)
    Modoc 5 (0) NA
    Mono 2.68 (2.57) 4.43 (6.09)
    Monterey 8.97 (4.53) 4.75 (3.21)
    Nevada 7.36 (3.41) 7.59 (15.86)
    Orange 17.83 (10.55) 9.24 (4.43)
    Placer 13.1 (10.24) 7.28 (15.48)
    Plumas 12.86 (8.94) 11.23 (10.33)
    Riverside 22.37 (16.1) 9.13 (6.27)
    Sacramento 15.17 (14.51) 8.25 (7.54)
    San Benito 4.99 (2.92) 4.71 (2.69)
    San Bernardino 16.3 (13.08) 9.42 (5.19)
    San Diego 15.27 (7.65) 8.3 (4.36)
    San Francisco 15.31 (14.21) 6.78 (5.14)
    San Joaquin 16.76 (12.26) 7.9 (7.81)
    San Luis Obispo 8.88 (5.15) 7.1 (4.52)
    San Mateo 12.24 (8.91) 6.8 (4.94)
    Santa Barbara 7.04 (3.92) 6.06 (3.51)
    Santa Clara 14.83 (10.65) 8.77 (5.59)
    Santa Cruz 8.55 (4.34) 5.23 (3.74)
    Shasta 7.25 (8.38) 5.04 (4.67)
    Siskiyou 2.69 (2.06) 7.6 (17.3)
    Solano 15.48 (14.15) 7.9 (5.46)
    Sonoma 11.13 (9.5) 6.38 (4.29)
    Stanislaus 19.66 (17.09) 12.12 (8.42)
    Sutter 12.82 (10.2) 10.04 (6.6)
    Tehama NA 5.87 (4.5)
    Trinity 2.78 (2.4) 10.73 (23.1)
    Tulare 23.1 (16.15) 10.41 (7.53)
    Ventura 13.4 (7.24) 6.85 (3.83)
    Yolo 10.58 (8.86) 5.44 (5.29)
    df_couty <- df %>% group_by(Year, County) %>% 
      summarise(
        n             = n(),
        mean_PM2.5    = mean(PM2.5, na.rm = T), 
        sd_PM2.5      = sd(PM2.5, na.rm = T),
      ) %>%
      mutate(se_PM2.5 = sd_PM2.5 / sqrt(n))
    `summarise()` has grouped output by 'Year'. You can override using the
    `.groups` argument.
    ggplot(df_couty, 
           aes(y = County, x = mean_PM2.5, 
               xmin = mean_PM2.5 - 1.96 * se_PM2.5, 
               xmax = mean_PM2.5 + 1.96 * se_PM2.5, 
               color = Year, group = Year)) +
      geom_pointrange(size = 0.3, fatten = 1.2) +
      labs(title = paste0("Daily PM2.5 Concentration (2002 vs 2022), by County"),
           x = "Daily PM2.5 Concentration (95% CI)", y = "County",) +
      scale_color_manual(name = "Year",
                         values = c("2002" = "#F8766D", "2022" = "#00BFC4")) +
      theme_minimal() +
      theme(legend.position = "bottom",
            legend.box = "horizontal",
            legend.justification = "center")

    Level 3: Sites in LA County

    Summary. The mean of daily PM 2.5 for all cities in Los Angeles county in 2022 are lower than that of 2000

      summary5 <- df %>% 
               filter(County == "Los Angeles", 
                      Year %in% c("2002", "2022"),
                      Local.Site.Name != "") %>%
          group_by(Year, Local.Site.Name) %>%
          summarise(
            n             = n(),
            mean          = round(mean(PM2.5, na.rm = T), 2), 
            sd            = round(sd(PM2.5, na.rm = TRUE), 2)) %>%
          mutate(`Mean (SD)`= paste0(mean, " (", sd, ")")) %>%
          select(Year, Local.Site.Name, `Mean (SD)`) %>%
          pivot_wider(
            names_from = Year, 
            values_from = `Mean (SD)`, 
            names_prefix = "Mean (SD) in ") %>%
          mutate(`Local Site Name` = Local.Site.Name) %>%
          select(`Local Site Name`, `Mean (SD) in 2002`, `Mean (SD) in 2022`) %>%
          arrange(`Local Site Name`) 
    `summarise()` has grouped output by 'Year'. You can override using the
    `.groups` argument.
      kable(as.data.frame(summary5), align = "c")
    Local Site Name Mean (SD) in 2002 Mean (SD) in 2022
    Azusa 20.76 (12.11) 9.72 (4.39)
    Burbank 23.97 (12.74) NA
    Compton NA 12.99 (6.22)
    Glendora NA 8.42 (5.47)
    Lancaster-Division Street 10.38 (4.47) 7.52 (2.48)
    Lebec 4.82 (2.78) 3.5 (1.65)
    Long Beach (North) 19.47 (10.39) 9.92 (3.27)
    Long Beach (South) NA 11.97 (4.32)
    Long Beach-Route 710 Near Road NA 13.02 (5.61)
    Los Angeles-North Main Street 21.97 (11.69) 11.58 (4.57)
    Lynwood 23.35 (11.96) NA
    North Hollywood (NOHO) NA 13.02 (4.75)
    Pasadena 20.29 (11.14) 9.09 (3.68)
    Pico Rivera #2 NA 11.45 (5.93)
    Reseda 18.85 (10.65) 10.72 (4.56)
    Santa Clarita NA 9.14 (3.9)
    Signal Hill (LBSH) NA 8.85 (4.42)
      ggplot(df %>% 
               filter(County == "Los Angeles", 
                      Year %in% c("2002", "2022"),
                      Local.Site.Name != ""), 
           aes(x = PM2.5,
               y = Local.Site.Name,
               fill = Year)) +
      geom_violin(alpha = 0.6, scale = "width", 
                  position = position_dodge(width = 0.8), trim = FALSE) +
      labs(
        title = "Daily PM2.5 Concentration in LA County",
        subtitle = "By monitoring sites (2002 vs 2022)",
        x = "Daily PM2.5 Concentration",
        y = "Site",
        fill = "Year") +
      scale_color_manual(name = "Year",
                             values = c("2002" = "#F8766D", "2022" = "#00BFC4")) +
      scale_fill_manual(name = "Year",
                             values = c("2002" = "#F8766D", "2022" = "#00BFC4")) +
      theme_minimal() +
      theme(
        legend.position = "bottom",
        legend.box = "horizontal",
        legend.justification = "center"
      )
    Warning: No shared levels found between `names(values)` of the manual scale and the
    data's colour values.

Reminder: after you upload your final rendered document to GitHub, you should download it to make sure that it looks right! If you haven’t included embed-resources: true in the YAML header, none of your figures will show up!